#   LibAut
#       joint onset & outcome regressions
#       Table 1, columns 2-3, Table C1
#   Juraj Medzihorsky
#   2021-10-29

library(arm)
library(GJRM)
library(parallel)
options(mc.cores=2)
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 3.6.3 (2020-02-29)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 16.04.7 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.9.so
##
##  locale:
##   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_GB.UTF-8       LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods   base
##
##  other attached packages:
##  [1] GJRM_0.2-5.1   mgcv_1.8-28    nlme_3.1-152   arm_1.9-3      lme4_1.1-18-1  Matrix_1.3-4   MASS_7.3-54
##  [8] nvimcom_0.9-28 colorout_1.2-0
##
##  loaded via a namespace (and not attached):
##   [1] Rcpp_1.0.7          ADGofTest_0.3       mvtnorm_1.0-8       lattice_0.20-45     gmp_0.5-13.2
##   [6] assertthat_0.2.1    psych_1.8.10        foreach_1.4.4       utf8_1.2.1          distr_2.7.0
##  [11] R6_2.5.0            magic_1.5-9         stats4_3.6.3        pcaPP_1.9-73        survey_3.34
##  [16] coda_0.19-4         ggplot2_3.3.3       pillar_1.6.1        rlang_0.4.11        minqa_1.2.4
##  [21] scam_1.2-6          nloptr_1.2.1        ismev_1.42          splines_3.6.3       foreign_0.8-76
##  [26] munsell_0.5.0       compiler_3.6.3      numDeriv_2016.8-1.1 pkgconfig_2.0.3     gsl_1.9-10.3
##  [31] mnormt_1.5-5        gamlss.dist_5.1-1   evd_2.3-3           tidyselect_1.1.1    tibble_3.1.2
##  [36] VineCopula_2.1.8    codetools_0.2-18    matrixStats_0.54.0  stabledist_0.7-1    fansi_0.5.0
##  [41] pspline_1.0-18      crayon_1.4.1        dplyr_1.0.6         grid_3.6.3          gtable_0.3.0
##  [46] lifecycle_1.0.0     DBI_1.0.0           magrittr_2.0.1      scales_1.1.1        Rmpfr_0.7-2
##  [51] distrEx_2.7.0       doParallel_1.0.14   ellipsis_0.3.2      generics_0.1.0      vctrs_0.3.8
##  [56] trust_0.1-7         iterators_1.0.10    tools_3.6.3         copula_0.999-19     glue_1.4.2
##  [61] purrr_0.3.4         sfsmisc_1.1-3       network_1.13.0.1    abind_1.4-5         survival_2.44-1.1
##  [66] colorspace_1.3-2    VGAM_1.0-6          startupmsg_0.9.5


#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)
figs_dir <- gsub('code$', 'figures', code_dir)
tabs_dir <- gsub('code$', 'tables', code_dir)


setwd(data_dir)
dat <- readRDS('onset_20210810.rds')
eps <- read.csv('eps.csv')
setwd(code_dir)

dat <- subset(dat, onset_eligible%in%1)

#------------------------------------------------------------------------------
#   get EP shares
tab_dem <- aggregate(eps$dem_ep, by=list(eps$year), FUN=mean, na.rm=T)
tab_aut <- aggregate(eps$aut_ep, by=list(eps$year), FUN=mean, na.rm=T)
names(tab_dem) <- c('year', 'share_dem_ep')
names(tab_aut) <- c('year', 'share_aut_ep')

tab <- merge(tab_dem, tab_aut, by='year')

tab_lag <- tab
tab_lag$year <- tab$year + 1
names(tab_lag)[2:3] <- paste0('lag1_', names(tab)[2:3])

tab_tab <- merge(tab, tab_lag)

dat <- merge(dat, tab_tab, by='year')

#------------------------------------------------------------------------------
#   get outcome

#   recode outcomes
#   epy: suc 1 fail 0
dat$epy <- as.numeric(rep(NA,nrow(dat)))
#   success: 1, 5
dat$epy[dat$dem_ep_outcome%in%c(1,5)] <- 1
#   fail: 2,3,4
dat$epy[dat$dem_ep_outcome%in%c(2,3,4)] <- 0
#   censored: 6
dat$epy[dat$dem_ep_outcome%in%6] <- NA

#   with(dat, table(epy, dem_ep_outcome, useNA='a'))
#   sum(!is.na(dat$epy))

#------------------------------------------------------------------------------

dat$logpop <- log(dat$e_mipopula) 

dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#   with(dat, table(reg_fac, e_regionpol_6C))

reglabs <- c('Eastern Europe & Central Asia',
             'Latin America &the Caribbean',
             'MENA, ISR, TUR',
             'Sub-Saharan Africa',
             'Western Europe, North America, CYP, AUS, NZ',
             'Asia & Pacific')

#------------------------------------------------------------------------------
#   models

#   Model 1: year cubic polynomial (reported in the paper)
#   Model 2: year gaussian process smooth (not reported in the paper)

#   formulas
y_0 <- epy ~ 1
s_0 <- libaut_start ~ 1
#   
lin_1 <- ~ . + 
    reg_fac +
    lag1_e_migdppcln + 
    lag1_e_migdpgro + 
    lag1_logpop +
    lag1_v2pepwrsoc + 
    lag1_v2xeg_eqdr + 
    lag1_v2xnp_pres + 
    lag1_v2csprtcpt +
    lag1_v2x_polyarchy +
    lag1_excl_region_edi + 
    lag1_e_miinteco + 
    lag1_e_miinterc 
#
lin_year <- ~ . + I((1e-1*(year-1990))) + I((1e-1*(year-1990))^2) + I((1e-1*(year-1990))^3) 
gam_year <- ~ . + s(year, bs="gp")
#
lin_sel <- ~ . + lag1_share_dem_ep + lag1_share_aut_ep
gam_sel <- ~ . + s(lag1_share_dem_ep, bs="gp") + s(lag1_share_aut_ep, bs="gp")
#
y_l_1 <- update(y_0, lin_1)
y_l_2 <- update(y_l_1, lin_year)
#
s_l_1 <- update(update(s_0, lin_1), lin_sel)
s_l_2 <- update(update(s_l_1, lin_year), lin_sel)


#   bernoulli-logistic (x2) GJRM with gaussian copula
system.time(
m_l_1 <- gjrm(list(s_l_1, y_l_1),
              data=dat, Model='BSS',
              gamlssfit=T, extra.regI='t', 
              margin=c('logit', 'logit'), BivD='N')
)

#   bernoulli-logistic (x2) GJRM with gaussian copula
system.time(
m_l_2 <- gjrm(list(s_l_2, y_l_2),
              data=dat, Model='BSS',
              gamlssfit=T, extra.regI='t', 
              margin=c('logit', 'logit'), BivD='N')
)


#------------------------------------------------------------------------------
#   tabulate

ms1 <- summary(m_l_1$gam1)$p.table
ms2 <- summary(m_l_2$gam1)$p.table

my1 <- summary(m_l_1$gam2)$p.table
my2 <- summary(m_l_2$gam2)$p.table

bb <- matrix(ncol=4, nrow=nrow(ms2))
bb[,] <- NA
rownames(bb) <- rownames(ms2)
bb[rownames(ms1),1] <- ms1[,1]
bb[rownames(my1),2] <- my1[,1]
bb[rownames(ms2),3] <- ms2[,1]
bb[rownames(my2),4] <- my2[,1]


bt1 <- paste(apply(round(bb[,1:2],2), 1, paste, collapse=' & '), '&&')
bt2 <- paste(apply(round(bb[,3:4],2), 1, paste, collapse=' & '), '\\\\')
bt <- apply(cbind(bt1,bt2), 1, paste, collapse='')

getci <-
    function(x)
    {
        ci <- round(data.frame(lo=x[,1]-1.96*x[,2], hi=x[,1]+1.96*x[,2]),2)
        y <- apply(ci, 1, paste0, collapse=', ')
        r <- paste0('(',y,')')
        names(r) <- rownames(ci)
        return(r)
    }


cc <- matrix(ncol=4, nrow=nrow(ms2))
cc[,] <- NA
rownames(cc) <- rownames(ms2)
cc[rownames(ms1),1] <- getci(ms1)
cc[rownames(my1),2] <- getci(my1)
cc[rownames(ms2),3] <- getci(ms2)
cc[rownames(my2),4] <- getci(my2)

ct1 <- paste(apply(cc[,1:2], 1, paste, collapse=' & '), '&&')
ct2 <- paste(apply(cc[,3:4], 1, paste, collapse=' & '), '\\\\')
ct <- apply(cbind(ct1,ct2), 1, paste, collapse='')

bt <- gsub('-', '$-$', bt)
ct <- gsub('-', '$-$', ct)
bt <- gsub('NA', '', bt)
ct <- gsub('NA', '', ct)

getaic <- function(x) round(AIC(x),2)

getauc <-
    function(x)
    {
        d <- x$model
        f <- fitted(x, type='response')
        round(getROC_AUC(f, d[,1])$auc, 2)
    }

getcc <-
    function(x)
    {
        d <- x$model
        f <- fitted(x, type='response')
        round(mean(round(f)==d[,1]), 2)
    }


aics <- paste0('Akaike Information Criterion & ',
               paste0(getaic(m_l_1$gam1), ' & ', getaic(m_l_1$gam2)), ' && ',
               paste0(getaic(m_l_2$gam1), ' & ', getaic(m_l_2$gam2)), ' \\\\')

aucs <- paste0('AUC & ',
               paste0(getauc(m_l_1$gam1), ' & ', getauc(m_l_1$gam2)), ' && ',
               paste0(getauc(m_l_2$gam1), ' & ', getauc(m_l_2$gam2)), ' \\\\')

ccs <- paste0('CC & ',
              paste0(getcc(m_l_1$gam1), ' & ', getcc(m_l_1$gam2)), ' && ',
              paste0(getcc(m_l_2$gam1), ' & ', getcc(m_l_2$gam2)), ' \\\\')

thetas <- paste0('$\\theta$ & \\multicolumn{2}{c}{', 
                 round(summary(m_l_1)$theta, 2), 
                 '} && \\multicolumn{2}{c}{',
                 round(summary(m_l_2)$theta, 2), 
                 '}\\\\') 

citheta <- paste0(' & \\multicolumn{2}{c}{(', 
                  paste0(round(summary(m_l_1)$CItheta[1], 2), ', '),
                  paste0(round(summary(m_l_1)$CItheta[2], 2), ')'),
                  '} && \\multicolumn{2}{c}{',
                  paste0(round(summary(m_l_2)$CItheta[1], 2), ', '),
                  paste0(round(summary(m_l_2)$CItheta[2], 2), ')'),
                  '}\\\\') 

yearline <- 'Year cubic polynomial & \\multicolumn{2}{c}{No} & \\multicolumn{2}{c}{Yes}\\\\'


tab <- character(length=4*length(bt))
for (i in 1:length(bt)) {
    ii <- (i-1)*3 + i
    tab[ii] <- rownames(bb)[i]
    tab[ii+1] <- paste0(' & ', bt[i])
    tab[ii+2] <- paste0(' & ', ct[i])
}
tab <- c(tab, yearline, thetas, citheta, aics, aucs, ccs)

setwd(tabs_dir)
writeLines(tab, con='Table_C1.tex')

#   SCRIPT END
